This is an R Markdown document containing code and plots for Abumazen et al. “Nasopharyngeal metatranscriptomics reveals host-pathogen signatures of pediatric sinusitis”.

Loading required R packages

library(reshape2)
library(pheatmap)
library(pROC)
library(RColorBrewer)
library(dplyr)
library(plyr)
library(vioplot)
library(Metrics)
library(DESeq2)
library(tximport)
library(EnhancedVolcano)
library(patchwork)
library(knitr)

1. Kraken2-based pathogen detection from RNA-seq data

Loading data files


Reading in the kraken2 data

tb = read.delim("d1-d5_fastp_dehost_krakenmerged_reads_normalized.txt",sep='\t',col.names=(c("X","Y","Z")))

matr = dcast(tb,Y ~ X,value.var = "Z")
rownames(matr) = matr[,1]
matr = matr[,-1]

for (i in 1:nrow(matr))
{   matr[i,which(is.na(matr[i,]))] = 0
}

matr = t(data.matrix(matr))

splitrownames = vector(length = nrow(matr))
for (i in 1:length(splitrownames))
{
    rownames(matr)[i] = strsplit(rownames(matr),"_")[[i]][1]
}

Reading in the metadata

metadata = read.csv("metadata.12.20.updated.csv",header=T)
metadata = metadata[which(metadata$batch != 1),] #removing samples from batch 1

shared = intersect(rownames(matr),metadata[,1])
matr = matr[match(shared,rownames(matr)),]
metadata = metadata[match(shared,metadata[,1]),]

Bacterial predictions


Heatmap

annotation_row = metadata[,c(22,24,26)]
annotation_row = (annotation_row -2) * -1   #just to make it going 0 to 1
rownames(annotation_row) = metadata[,1]
colnames(annotation_row) = c("SPN", "HFLU", "MCAT")


cols = c("white",colorRampPalette((brewer.pal(n = 7, name =
  "YlGnBu")))(99))

ann_colors = list(
    SPN = c("white", "light gray"),
    HFLU = c("white", "light gray"),
    MCAT = c("white", "light gray")
)


pheatmap(log(matr[,c(grep("Moraxella cat", colnames(matr)),grep("Streptococcus pne", colnames(matr)),grep("Haemophilus inf", colnames(matr)))] + 1, 10),annotation_row=annotation_row[,c(2,1,3)],cluster_cols = F, show_rownames = FALSE,col=cols,annotation_colors = ann_colors)

Box Plots

  group = metadata[,"mcat"]
    group <- mapvalues(group, 
          from=c("2","1"), 
          to=c("-","+"))
    group = as.factor(group)

    abund = log(matr[,"Moraxella catarrhalis"] + 1,10)
  p = ggplot(data.frame(mcat = abund), aes(y = abund, x = group,fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  theme(legend.position = "none") +
    ggtitle("mcat") +    theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
  geom_point(pch = 21, position = position_jitterdodge())

  p

  group = metadata[,"spn"]
        group <- mapvalues(group, 
          from=c("2","1"), 
          to=c("-","+"))
    group = as.factor(group)

    abund = log(matr[,"Streptococcus pneumoniae"] + 1,10)
    
  p = ggplot(data.frame(spn = abund), aes(y = abund, x = group,fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  theme(legend.position = "none") +
    ggtitle("spn") +     theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") + 
  geom_point(pch = 21, position = position_jitterdodge())

  p

  group = metadata[,"hflu"]
        group <- mapvalues(group, 
          from=c("2","1"), 
          to=c("-","+"))
    group = as.factor(group)

    abund = log(matr[,"Haemophilus influenzae"] + 1,10)
     
  p = ggplot(data.frame(hflu = abund), aes(y = abund, x = group,fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  theme(legend.position = "none") +
    ggtitle("hflu") +    theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
  geom_point(pch = 21, position = position_jitterdodge())

  p

boxplot(log((matr[which(metadata[,"mcat"] == 2),"Moraxella catarrhalis"]) + 1, 10),log((matr[which(metadata[,"mcat"] != 2),"Moraxella catarrhalis"]) + 1, 10),main="MCAT",names=c("-","+"),ylab="log10(RPM)")

boxplot(log((matr[which(metadata[,"spn"] == 2),"Streptococcus pneumoniae"]) + 1,10),log((matr[which(metadata[,"spn"] != 2),"Streptococcus pneumoniae"]) + 1, 10),main="SPN",names=c("-","+"),ylab="log10(RPM)")

boxplot(log((matr[which(metadata[,"hflu"] == 2),"Haemophilus influenzae"]) + 1,10),log((matr[which(metadata[,"hflu"] != 2),"Haemophilus influenzae"]) + 1, 10),main="HFLU",names=c("-","+"),ylab="log10(RPM)")

M. catarrhalis prediction accuracy

bacterial_detection_threshold = 1

values = unique(rev(sort(matr[,"Moraxella catarrhalis"])))
TPR = vector(length = length(values)) 
FPR = vector(length = length(values)) 
for (i in 1:length(values))

{ k = values[i] 
    matches = which(matr[,"Moraxella catarrhalis"] >= k)
    TPR[i] = length(intersect(matches,which(metadata[,"mcat"] == 1))) / length(which(metadata[,"mcat"] == 1))
    FPR[i] = length(intersect(matches,which(metadata[,"mcat"] == 2))) / length(which(metadata[,"mcat"] == 2)) 
} 


prediction = matr[,"Moraxella catarrhalis"]
response = metadata[,"mcat"]
response = (response -2) * -1
auc(response,prediction)
## [1] 0.823844
auc_mcat = auc(response,prediction)

plot(FPR,TPR,type="l",main=paste("mcat","AUC = ",round(auc_mcat,2)))

matches = which(matr[,"Moraxella catarrhalis"] > bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata[,"mcat"] == 1))) / length(which(metadata[,"mcat"] == 1))
fpr0 = length(intersect(matches,which(metadata[,"mcat"] == 2))) / length(which(metadata[,"mcat"] == 2)) 
points(x = fpr0, y = tpr0,cex=2)

# prints out the sensitivity and the specificity given the threshold defined above
print(c("mcat",tpr0 * 100,100-(fpr0 * 100)))
## [1] "mcat"             "93.2203389830508" "52.4271844660194"
sens_mcat = tpr0 * 100
spec_mcat = 100-(fpr0 * 100)

S. pneumoniae prediction accuracy

values = unique(rev(sort(matr[,"Streptococcus pneumoniae"])))
TPR = vector(length = length(values)) 
FPR = vector(length = length(values)) 
for (i in 1:length(values))

{ k = values[i] 
    matches = which(matr[,"Streptococcus pneumoniae"] >= k)
    TPR[i] = length(intersect(matches,which(metadata[,"spn"] == 1))) / length(which(metadata[,"spn"] == 1))
    FPR[i] = length(intersect(matches,which(metadata[,"spn"] == 2))) / length(which(metadata[,"spn"] == 2)) 
} 

prediction = matr[,"Streptococcus pneumoniae"]
response = metadata[,"spn"]
response = (response -2) * -1
auc(response,prediction)
## [1] 0.8915326
auc_spn = auc(response,prediction)

plot(FPR,TPR,type="l",main=paste("spn","AUC = ",round(auc_spn,2)))

matches = which(matr[,"Streptococcus pneumoniae"] > bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata[,"spn"] == 1))) / length(which(metadata[,"spn"] == 1))
fpr0 = length(intersect(matches,which(metadata[,"spn"] == 2))) / length(which(metadata[,"spn"] == 2)) 
points(x = fpr0, y = tpr0,cex=2)

# prints out the sensitivity and the specificity given the threshold defined above
print(c("spn",tpr0 * 100,100-(fpr0 * 100)))
## [1] "spn"              "87.1428571428571" "81.4569536423841"
sens_spn = tpr0 * 100
spec_spn = 100-(fpr0 * 100)

H. influenzae prediction accuracy

values = unique(rev(sort(matr[,"Haemophilus influenzae"])))
TPR = vector(length = length(values)) 
FPR = vector(length = length(values)) 
for (i in 1:length(values))

{ k = values[i] 
    matches = which(matr[,"Haemophilus influenzae"] >= k)
    TPR[i] = length(intersect(matches,which(metadata[,"hflu"] == 1))) / length(which(metadata[,"hflu"] == 1))
    FPR[i] = length(intersect(matches,which(metadata[,"hflu"] == 2))) / length(which(metadata[,"hflu"] == 2)) 
} 

prediction = matr[,"Haemophilus influenzae"]
response = metadata[,"hflu"]
response = (response -2) * -1
auc(response,prediction)
## [1] 0.9549669
auc_hflu = auc(response,prediction)

plot(FPR,TPR,type="l",main=paste("hflu","AUC = ",round(auc_hflu,2)))

matches = which(matr[,"Haemophilus influenzae"] > bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata[,"hflu"] == 1))) / length(which(metadata[,"hflu"] == 1))
fpr0 = length(intersect(matches,which(metadata[,"hflu"] == 2))) / length(which(metadata[,"hflu"] == 2)) 
points(x = fpr0, y = tpr0,cex=2)

# prints out the sensitivity and the specificity given the threshold defined above
print(c("hflu",tpr0 * 100,100-(fpr0 * 100)))
## [1] "hflu"             "95.7142857142857" "86.7549668874172"
sens_hflu = tpr0 * 100
spec_hflu = 100-(fpr0 * 100)


accuracies = cbind(sens = c(mcat = sens_mcat,spn = sens_spn,hflu = sens_hflu),spec = c(mcat = spec_mcat,spec_spn,spec_hflu),auc = c(auc_mcat, auc_spn, auc_hflu))

kable(accuracies)
sens spec auc
mcat 93.22034 52.42718 0.8238440
spn 87.14286 81.45695 0.8915326
hflu 95.71429 86.75497 0.9549669

Viral predictions


Heatmap

viral_reads = matrix(ncol=12,nrow = nrow(matr))

colnames(viral_reads) = c("INFA","INFB","INFC","MPV","RSV","HRV","PIV1","PIV2","PIV3","PIV4","ADV","EVD68")

rownames(viral_reads) = rownames(matr)

viral_reads[,"INFA"] = matr[,"Influenza A virus"]

viral_reads[,"INFB"] = matr[,"Influenza B virus"]

viral_reads[,"INFC"] = matr[,"Influenza C virus"]

viral_reads[,"MPV"] = matr[,"Human metapneumovirus"]

viral_reads[,"RSV"] = matr[,"Respiratory syncytial virus"] + matr[,"Human orthopneumovirus"]

viral_reads[,"HRV"] = matr[,"Rhinovirus A"] + matr[,"Rhinovirus B"] + matr[,"Rhinovirus C"]

viral_reads[,"PIV1"] = matr[,"Human respirovirus 1"]

viral_reads[,"PIV2"] = matr[,"Human orthorubulavirus 2"]

viral_reads[,"PIV3"] = matr[,"Human respirovirus 3"]

viral_reads[,"PIV4"] = matr[,"Human orthorubulavirus 4"]

viral_reads[,"ADV"] = matr[,"Human mastadenovirus B"] + matr[,"Human mastadenovirus C"]

viral_reads[,"EVD68"] = matr[,"Enterovirus D"]


tomap = log(viral_reads + 1, 10)

annotation = metadata[,c(34,32,59,57,55,53,49,63,51,46,40,36)]
rownames(annotation) = metadata[,1]
colnames(annotation) = rev(colnames(tomap))

cols = c("white",colorRampPalette((brewer.pal(n = 7, name =
  "YlGnBu")))(99))

for (i in 1:ncol(annotation))
{
    annotation[,i] = as.factor(findInterval(annotation[,i],c(10,20,30,40)))

}

ann_colors = list(
    INFA = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    INFB = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    INFC = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    MPV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    RSV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    HRV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV1 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV2 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV3 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV4 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    ADV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    EVD68 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black")
)
pheatmap(tomap,
         annotation_row=annotation,
         annotation_colors = ann_colors,
         cluster_cols = F,
         show_rownames = F,
         col=cols
)

Box plots

viruses = c("INFA","INFB","INFC","MPV","RSV","HRV","PIV1","PIV2","PIV3","PIV4","ADV","EVD68")


ctthreshold = 35

annotation = metadata[,c(34,32,59,57,55,53,49,63,51,46,40,36)]
rownames(annotation) = metadata[,1]
colnames(annotation) = rev(colnames(tomap))

annotation[annotation >= ctthreshold] = NA

annotation[annotation < ctthreshold] = 1

annotation[is.na(annotation)] = 0

par(mfrow=c(4,3), mar=c(4, 4, 2.5, 1.5))

sens_scores = vector(length = 12) # for each virus
spec_scores = vector(length = 12) # for each virus

for (viralcount in 1:12)
{   
        virus = viruses[viralcount]

        values = unique(rev(sort(viral_reads[,virus])))
        TPR = vector(length = length(values)) 
        FPR = vector(length = length(values)) 
        for (i in 1:length(values))
        {   k = values[i] 
            matches = which(viral_reads[,virus] >= k)
            TPR[i] = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
            FPR[i] = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 
        } 
        #plot(FPR,TPR,type="l",main=virus)

        boxplot(tomap[which(annotation[,virus] == 0),virus],tomap[which(annotation[,virus] == 1),virus],main=virus)

        matches = which(viral_reads[,virus] > 0)
        tpr0 = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
        fpr0 = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 

        # prints out the sensitivity and the specificity
        #print(c(virus,tpr0 * 100,100-(fpr0 * 100)))

        sens_scores[viralcount] = as.numeric(tpr0 * 100)
        spec_scores[viralcount] = as.numeric(100 - (fpr0 * 100))
}

kable(cbind(virus = viruses,sens = sens_scores,spec = spec_scores))
virus sens spec
INFA 100 93.6585365853659
INFB 100 97.1563981042654
INFC 33.3333333333333 95.8139534883721
MPV 100 90.8653846153846
RSV 90 92.4170616113744
HRV 73.469387755102 77.2357723577236
PIV1 100 94.4954128440367
PIV2 100 98.6175115207373
PIV3 100 91.4691943127962
PIV4 90.9090909090909 91.4285714285714
ADV 44.4444444444444 96.551724137931
EVD68 100 90

Optional: exploring prediction accuracy across multiple CT thresholds

avg_sensitivity = vector(length = 45)
avg_specificity = vector(length = 45)


for (count in 1:45) #this is the ct threshold

{
    ctthreshold = count

    annotation = metadata[,c(34,32,59,57,55,53,49,63,51,46,40,36)]
    rownames(annotation) = metadata[,1]
    colnames(annotation) = rev(colnames(tomap))

    annotation[annotation >= ctthreshold] = NA

    annotation[annotation < ctthreshold] = 1

    annotation[is.na(annotation)] = 0

    par(mfrow=c(4,3))

    sens_scores = vector(length = 12) # for each virus
    spec_scores = vector(length = 12) # for each virus

    for (viralcount in 1:12)
    {   
        virus = viruses[viralcount]

        values = unique(rev(sort(viral_reads[,virus])))
        TPR = vector(length = length(values)) 
        FPR = vector(length = length(values)) 
        for (i in 1:length(values))
        {   k = values[i] 
            matches = which(viral_reads[,virus] >= k)
            TPR[i] = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
            FPR[i] = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 
        } 
        plot(FPR,TPR,type="l",main=virus)

        matches = which(viral_reads[,virus] > 0)
        tpr0 = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
        fpr0 = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 

        # prints out the sensitivity and the specificity
        print(c(virus,tpr0 * 100,100-(fpr0 * 100)))

        sens_scores[viralcount] = as.numeric(tpr0 * 100)
        spec_scores[viralcount] = as.numeric(100 - (fpr0 * 100))

    }

    print(paste(count,mean(sens_scores)))
    print(paste(count,mean(spec_scores)))

    avg_sensitivity[count] = mean(sens_scores)
    avg_specificity[count] = mean(spec_scores)

}

Exploratory analysis of newly identified pathogens


First, we will produce a heatmap of all kraken2-predicted taxa that contain “virus” in their organism names.

allViruses = matr[,grep("virus",colnames(matr))]
pheatmap(t(log(allViruses[,which(apply(allViruses,2,max) >= 10)] + 1,10)),fontsize_col=5)

Calculation of total bacterial pathogen and viral abundance


Here, we will calculate the total relative abundance of the three bacterial sinusitis pathogens (HFLU, MCAT, SPN), as well as the total relative abundance of respiratory viruses. This is done by summing their log10(RPM) values.

bacterial_pathogens = c("Acinetobacter baumannii","Acinetobacter johnsonii","Bordetella parapertussis","Chlamydia pneumoniae","Eikenella corrodens","Enterobacter cloacae","Haemophilus influenzae","Haemophilus parainfluenzae","Klebsiella aerogenes","Klebsiella pneumoniae","Moraxella catarrhalis","Neisseria meningitidis","Pseudomonas aeruginosa","Streptococcus anginosus","Staphylococcus aureus","Streptococcus agalactiae","Streptococcus pneumoniae","Streptococcus pyogenes")

viral_pathogens = c("Respiratory syncytial virus","Enterovirus D","Human coronavirus 229E","Human coronavirus HKU1","Betacoronavirus 1","Human coronavirus NL63","Human mastadenovirus B","Human mastadenovirus C","Human metapneumovirus","Human orthopneumovirus","Human orthorubulavirus 2","Human orthorubulavirus 4","Human respirovirus 1","Human respirovirus 3","Influenza A virus","Influenza B virus","Influenza C virus","Rhinovirus A","Rhinovirus B","Rhinovirus C")

three_bacteria = c("Haemophilus influenzae", "Moraxella catarrhalis", "Streptococcus pneumoniae")



bacPathAbundance = apply(matr[,three_bacteria],1,sum)  # sum of their individual RPMs

viralPathAbundance = apply(matr[,viral_pathogens],1,sum)   # sum of their individual RPMs


metadata = metadata %>% mutate(threebac_quantile = ntile(bacPathAbundance, 10), viral_quantile = ntile(viralPathAbundance, 10))


metadata$high_bac<- ifelse(metadata$threebac_quantile >=7, 1, 0)
metadata$high_viral<- ifelse(metadata$viral_quantile >=8, 1, 0)

metadata$high_pathogens <- ifelse(metadata$threebac_quantile >=7 & metadata$viral_quantile >=8,3, 
                           ifelse(metadata$threebac_quantile >=7, 2,
                           ifelse(metadata$viral_quantile >=8, 1, 0)))

2. Gene Expression Analyses


Loading data and formatting metadata

genesymbols = read.delim("host-expression/gencode.v39.metadata.HGNC")

metadata = metadata[which(metadata$batch != 1),] #removing samples from batch 1
metadata[,"batch"] = as.factor(metadata[,"batch"])

metadata[,"bv_both_none"] = as.factor(metadata[,"bv_both_none"])

metadata[metadata[,"cum_pathogn"] > 1,"cum_pathogn"] = 1

metadata[,"cum_pathogn"] = as.factor(metadata[,"cum_pathogn"])

metadata[,"sex"] = as.factor(metadata[,"sex"])

metadata$age_scaled = scale(metadata$Age.in.years)

DESEQ2 pairwise comparison of bacteria-only vs virus-only group

onlyvirusonlybacteria_samples = metadata[which((metadata$bv_both_none == 1) | (metadata$bv_both_none == 2)),]

onlyvirusonlybacteria_samples = onlyvirusonlybacteria_samples %>% arrange(cum_pathogn)

subsetfiles <- file.path("host-expression/quants", onlyvirusonlybacteria_samples$Filename, "quant.sf")
subset_txi.salmon <- tximport(subsetfiles, type = "salmon", tx2gene = genesymbols)

dds <- DESeqDataSetFromTximport(subset_txi.salmon, onlyvirusonlybacteria_samples, ~ batch + sex + age_scaled + cum_pathogn)

dds <- DESeq(dds)

res <- results(dds)

summary(res)
## 
## out of 31126 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2944, 9.5%
## LFC < 0 (down)     : 1891, 6.1%
## outliers [1]       : 0, 0%
## low counts [2]     : 8333, 27%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
pairwise_genes = which(res$padj < 0.001)

bac_upDEGs = intersect(which(res$padj < 0.001),which(res$log2FoldChange > 0))

viral_upDEGs = intersect(which(res$padj < 0.001),which(res$log2FoldChange < 0))

Number of DEGs detected:

There are 548 bacterial upDEGs and 273 viral upDEGs.

Volcano Plot

EnhancedVolcano(res,
  lab = rownames(res),
  x = 'log2FoldChange',
  y = 'pvalue',
  pCutoff = 0.001,
  FCcutoff = 0,
  xlim = c(-7, 7),
    ylim=c(-1,27),
  pointSize = 1.5,
  labSize = 2.5,
  title = 'DESeq2 results',
  subtitle = 'Differential expression',
  caption = 'p-value cutoff, 0.001',
  legendPosition = "none",
  legendLabSize = 14,
  col = c('grey30', 'light gray', 'royalblue', 'red2'),
  colAlpha = 0.9,
  drawConnectors = FALSE,
  widthConnectors = 0.5)

Host-response / pathogen abundance correlations

Read in full set of files, including those with NO pathogens and BOTH (virus and bacteria)

files <- file.path("host-expression/quants", metadata$Filename, "quant.sf")
txi.salmon <- tximport(files, type = "salmon", tx2gene = genesymbols)
pdds <- DESeqDataSetFromTximport(txi.salmon, metadata, ~ batch + sex + age_scaled + cum_pathogn)
prld <- varianceStabilizingTransformation(pdds, blind = TRUE)

Jitter plots of expression levels for selected genes (previous biomarkers)

These are plots of host gene expression per clinical group

pathOrNot = as.numeric(metadata[,"bv_both_none"]) # using clinical categories

genelist = c("CXCL10", "PRF1", "IFI27", "CCL8", "PTGS2", "S100A9", "PLAUR", "TNFAIP3","IL1B")
for (i in 1:length(genelist)) {
  gene = genelist[i] 
  print(gene)
  p = ggplot(data.frame(cbind(assay(prld)[gene,],pathOrNot)), aes(y = V1, x = as.factor(pathOrNot),fill=as.factor(pathOrNot))) +
  geom_boxplot(outlier.shape=NA) +
  ggtitle(gene) +
 theme(axis.title = element_blank(), legend.position="none") + 
    labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("None detected", "Bacterial", "Viral", "Both")) +
  labs(fill = "Infection type")
  assign(paste0("p", i), p)
}
## [1] "CXCL10"
## [1] "PRF1"
## [1] "IFI27"
## [1] "CCL8"
## [1] "PTGS2"
## [1] "S100A9"
## [1] "PLAUR"
## [1] "TNFAIP3"
## [1] "IL1B"
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + theme(legend.position="right") + plot_layout(nrow = 3)

Setting up data for sorted heatmaps

#setting up a temp matrix with Z-score scaled expression levels
temp = t(scale(t(assay(prld[,])))) 
temp[temp < -5] = -5
temp[temp > +5] = +5
temp[is.na(temp)] = 0

annotation_col = data.frame(
    Category = metadata$high_pathogens, 
            Viral_pathogen_abundance = metadata[,"viral_quantile"],
     Bacterial_pathogen_abundance = metadata[,"threebac_quantile"],
     Symptom_severity_score = metadata[,"PRSS"],
     Days_with_cold = metadata[,"days_with_cold"],
     Infection = factor(metadata[,"bv_both_none"], labels = c("None","Bacterial", "Viral", "Both"))
     )

Viral sorted heatmap

heatmap <- pheatmap(temp[viral_upDEGs,order(viralPathAbundance)], 

    annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),

    Resistance = c("No" = "white", "Yes" = "black")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

Bacterial sorted heatmap

heatmap <- pheatmap(temp[bac_upDEGs,order(bacPathAbundance)], 

annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
    Resistance = c("No" = "green", "Yes" = "red")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

Combined heatmap

heatmap1 <- pheatmap(temp[viral_upDEGs,order(metadata$high_pathogen)], 

    annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
    Resistance = c("No" = "white", "Yes" = "black")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

heatmap2 <- pheatmap(temp[bac_upDEGs,order(metadata$high_pathogen)], 

    annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
    Resistance = c("No" = "white", "Yes" = "black")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

Jitter plots of host response scores vs pathogen abundance

viralResponseScore = apply(temp[viral_upDEGs,],2,mean)

bacterialResponseScore = apply(temp[bac_upDEGs,],2,mean)


group = as.numeric(metadata[,"threebac_quantile"])

  p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Bacterial pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)") +
    theme(legend.position = "none")
  
  p

group = as.numeric(metadata[,"viral_quantile"])

  p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Viral pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)") +
    theme(legend.position = "none")
  p

group = as.numeric(metadata[,"high_pathogens"])

  p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)") 
  p

group = as.numeric(metadata[,"high_pathogens"])

  p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)")
  
  
  p

Individual gene expression levels across groups

pathOrNot = as.numeric(metadata[,"high_pathogens"]) # using categories "Both low" "Viral high" "Bacterial high" "Both high"

genelist = c("PTGS2", "S100A9", "PLAUR", "TNFAIP3", "SERPINA2", "CXCL2","IL1B", "CXCL11","IFNL1","CXCL10", "PRF1", "IFI27", "CCL8","OAS2")
for (i in 1:length(genelist)) {
  gene = genelist[i] 
  p = ggplot(data.frame(cbind(assay(prld)[gene,],pathOrNot)), aes(y = V1, x = as.factor(pathOrNot),fill=as.factor(pathOrNot))) +
  geom_boxplot(outlier.shape=NA) +
  ggtitle(gene) +
  theme(axis.title = element_blank(), legend.position="none") +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("None detected", "Bacterial", "Viral", "Both")) +
  labs(fill = "Infection type")
  assign(paste0("p", i), p)
}
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + p10 + p11 + p12 + p13 + p14 + plot_layout(nrow = 2)

Testing the specificity of the bacterial host response for SPN/HFLU/MCAT

Here, we will compute the correlation between the bacterial host response score and the total bacterial pathogen abundance (MCAT + SPN + HFLU), and compare this to 1000 randomly samples selections of three bacterial species from the data.

abundant_bacteria = setdiff(which(apply(matr,2,max) >= 10),grep("virus",colnames(matr)))
cors = vector(length = 1000)
for (i in 1:1000)
{

        randSetOfThree = sample(abundant_bacteria,3)
        threeBacAbundance = apply(matr[,randSetOfThree],1,sum)  # sum of their individual RPMs
        #ntile(threeBacAbundance, 10)
        cors[i] = cor(bacterialResponseScore,log(threeBacAbundance + 1,10))
}

hist(cors,xlim=c(-1,1),breaks=100)
abline(v = cor(bacterialResponseScore, log(bacPathAbundance + 1,10)))

ROC curves for predicting high/low pathogen abundance based on host response

Viral ROC

response = vector(length = nrow(metadata))
response[] = 0
response[which(metadata$high_pathogen == 1 | metadata$high_pathogen == 3)] = 1

prediction = viralResponseScore
bacPlot = plot.roc(roc(response,prediction))

#coords(bacPlot) #for a list of sensitivities and specificities

auc(response,prediction)
## [1] 0.8752688

Bacterial ROC

response = vector(length = nrow(metadata))
response[] = 0
response[which(metadata$high_pathogen == 2 | metadata$high_pathogen == 3)] = 1

prediction = bacterialResponseScore
bacPlot = plot.roc(roc(response,prediction))

#coords(bacPlot) #for a list of sensitivities and specificities

auc(response,prediction)
## [1] 0.7663192